library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

In this deliverable, I will explain how one might use MLE and MM to model (a) Glycohemoglobin and (b) Height of adult females. The data will be from National Health and Nutrition Examination Survey 2009-2010 (NHANES), available from the Hmisc package. I will compare and contrast the two methods in addition to comparing and contrasting the choice of underlying distribution. #Data

Hmisc::getHdata(nhgh)
d1 <- nhgh %>% 
  filter(sex == "female") %>% 
  filter(age >= 18) %>% 
  select(gh, ht) %>% 
  filter(1:n()<=1000)

a)Glycohemoglobin

Method of Moments (MM)

Normal Distribution

The first setp is to estimates the parameter. We can use mean function to find the mean of Glycohemoglobin, and sd function for the standard diviation of Glycohemoglobin.

gh<-d1$gh
#Estimates of parameters
mm.norm.mean=mean(gh)
mm.norm.sd=sd(gh)
mm.norm.mean
## [1] 5.7246
mm.norm.sd
## [1] 1.052246

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dnorm() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf does not match the histogram. From the graph, Estimated pdf also doesn’t match the pdf well.

# Overlay estimated pdf onto histogram
hist(gh,main="Overlay Glycohemoglobin estimated pdf onto histogram",xlab="GH",breaks=100,freq=F)
curve(dnorm(x, mm.norm.mean,mm.norm.sd),add=T,lwd=1)

## Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pnorm() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF is right of the eCDF. the estimated CDF does not match eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(gh),main  = "Overlay Glycohemoglobin estimated CDF onto eCDF ",ylab="CDF",xlab="GH",col="orange")
#estimated CDF
curve(pnorm(x, mm.norm.mean,mm.norm.sd),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qnorm() for the theoretical quantile.We can see here that the plotted points do not match the line y=x. The sample distribution does not match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(gh,ppoints(1000))
theore=qnorm(ppoints(1000),mm.norm.mean,mm.norm.sd)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qnorm() at 0.5.The result is 5.7246

# Estimated Median
qnorm(.5,mm.norm.mean,mm.norm.sd)
## [1] 5.7246

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can daw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rnorm(sample.size*sample.times,mm.norm.mean,mm.norm.sd ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 5.641834 5.805573

Gamma Distribution

The first setp is to estimates the parameter. We can find the shape of Glycohemoglobin, and scale of Glycohemoglobin.

gh<-d1$gh
#Estimates of parameters
gh<-d1$gh
mm.gamma.shape=mean(gh)^2/var(gh)
mm.gamma.scale=var(gh)/mean(gh)
#Shape
mm.gamma.shape
## [1] 29.59754
#Scale
mm.gamma.scale
## [1] 0.1934147

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dgamma() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf does not match the histogram. From the graph, Estimated pdf also doesn’t match the pdf well.

# Overlay estimated pdf onto histogram
hist(gh,main="Overlay Glycohemoglobin estimated pdf onto histogram",xlab="GH",breaks=100,freq=F)
curve(dgamma(x,shape=mm.gamma.shape,scale=mm.gamma.scale),add=TRUE,lwd=1)

# Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pgamma() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF is right of the eCDF. the estimated CDF does not match eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(gh),main  = "Overlay Glycohemoglobin estimated CDF onto eCDF ",ylab="CDF",xlab="GH",col="orange")
#estimated CDF
curve(pgamma(x,shape=mm.gamma.shape,scale=mm.gamma.scale),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qgamma() for the theoretical quantile.We can see here that the plotted points do not match the line y=x. The sample distribution does not match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(gh,ppoints(1000))
theore=qgamma(ppoints(1000),shape=mm.gamma.shape,scale=mm.gamma.scale)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qgamma() at 0.5

# Estimated Median
qgamma(0.5,shape=mm.gamma.shape,scale=mm.gamma.scale)
## [1] 5.660259

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can draw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rgamma(sample.size*sample.times,shape=mm.gamma.shape,scale=mm.gamma.scale ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 5.581732 5.742241

Weibull Distribution

lambda and k are two parameters here. To find these parameters, we need have some functions first. First, we have the lambda function here. Based on the function \(V[x]=(E[X]/\Gamma(1 + 1/k)^2 [\Gamma(1 + 2/k) - (\Gamma(1 + 1/k))^2]\), we can also have the variance function. To find the k, we need opytimize the function to find the minimum.

gh<-d1$gh
# Estimates of parameters
lambda=function(samp_mean,k){
  samp_mean/gamma(1+1/k)
}
var_weib=function(k,samp_mean,samp_var){
  lambda(samp_mean,k)^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp_var
}
#optimization
mm.opt <- optimize(f = function(x) {
    abs(var_weib(k = x, samp_mean = mean(gh),samp_var = var(gh)))
    },  lower = 5, upper = 10000)

#MM weibull k estimate
weib.k <- mm.opt$minimum
weib.k
## [1] 6.353162
#MM weibull lambda estimate
weib.lambda <- lambda(samp_mean = mean(gh), k = weib.k)
weib.lambda
## [1] 6.151309

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dweibull expression. The sahpe here equals weibull k, and the scale here is weibull.lambda. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf does not match the histogram. From the graph, Estimated pdf also doesn’t match the pdf well.

# Overlay estimated pdf onto histogram
hist(gh,main="Overlay Glycohemoglobin estimated pdf onto histogram",xlab="GH",breaks=100,freq=F)
curve(dweibull(x,shape=weib.k,scale=weib.lambda),add=TRUE,lwd=1)

# Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pweibull expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF is right of the eCDF. the estimated CDF does not match eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(gh),main  = "Overlay Glycohemoglobin estimated CDF onto eCDF ",ylab="CDF",xlab="GH",col="orange")
#estimated CDF
curve(pweibull(x,shape=weib.k,scale=weib.lambda),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qweibull() for the theoretical quantile.We can see here that the plotted points do not match the line y=x. The sample distribution does not match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(gh,ppoints(1000))
theore=qweibull(ppoints(1000),shape=weib.k,scale=weib.lambda)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qweibull() at 0.5

# Estimated Median
qweibull(0.5,shape=weib.k,scale=weib.lambda)
## [1] 5.806483

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can draw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rweibull(sample.size*sample.times,shape=weib.k,scale=weib.lambda), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 5.725211 5.886335

Maximum likelihood (MLE)

Normal Distribution

For performing MLE, a function has created for using the normal distribution and its parameters.

require(stats4)
## Loading required package: stats4
nLL <- function(mean, sd){
  fs <- dnorm(
        x = gh
      , mean = mean
      , sd = sd
      , log = TRUE
    ) 
  -sum(fs)
}
gh.mle <- mle(
    nLL
  , start = list(mean = 5, sd = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(gh.mle), absVal = FALSE)

# Estimates of parameters We can use coef() function to find mean and sd here.

#Estimates of parameters
coef(gh.mle)
##     mean       sd 
## 5.724615 1.051720

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dnorm() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf does not match the histogram. From the graph, Estimated pdf also doesn’t match the pdf well.Result is same with MM.

# Overlay estimated pdf onto histogram
hist(gh,main = "Overlay estimated pdf onto histogram (MLE)",col="white",xlab="GH",freq = FALSE,breaks=50)
curve(dnorm(x, coef(gh.mle)[1], coef(gh.mle)[2]), add = T)

Overlay estimated CDF onto eCDF

Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pnorm() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph, hile, the estimated CDF has a similar tendency with eCDF, the red estimated CDF is right of the eCDF. the estimated CDF does not match eCDF well. Result is same with MM.

# Overlay estimated CDF onto eCDF
plot(ecdf(gh), main = "Overlay estimated CDF onto eCDF (MLE)",xlab="GH",ylab="CDF",col="orange")
curve(pnorm(x, coef(gh.mle)[1], coef(gh.mle)[2]), add = T,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

QQ plot (sample vs estimated dist)

Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qnorm() for the theoretical quantile.We can see here that the plotted points do not match the line y=x. The sample distribution does not match with the theoretical distribution.

# QQ plot (sample vs estimated dist)(mle)
sample=quantile(gh,ppoints(1000))
theore=qnorm(ppoints(1000),coef(gh.mle)[1],coef(gh.mle)[2])
plot(theore,sample,pch=10,main="QQ plot (sample vs estimated dist)(MLE)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

Estimated Median

For the estimated median, we look for qnorm() at 0.5.The result is 5.724615

# Estimated Median
qnorm(.5,mean = coef(gh.mle)[1],sd=coef(gh.mle)[2])
## [1] 5.724615

Median Samp Dist (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can daw a histogram for the median sample distribution.

# Median Samp Dist (hist)
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rnorm(sample.size*sample.times,mean =coef(gh.mle)[1],sd=coef(gh.mle)[2] ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)

hist(sample,xlab = "Median Sample",main="Median Samp Dist (hist)(MLE)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 5.641783 5.804954

Gamma Distribution

For performing MLE, a function has created for using the normal distribution and its parameters.

require(stats4)
nLL <- function(shape,scale){
  fs <- dgamma(
        x = gh
      , shape = shape
      , scale = scale
      , log = TRUE
    ) 
  -sum(fs)
}
gh.mle <- mle(
    nLL
  , start = list(shape = 1, scale = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(gh.mle), absVal = FALSE)

# Estimates of parameters We can use coef() function to find shpe and scale here.

#Estimates of parameters
coef(gh.mle)
##      shape      scale 
## 40.7065049  0.1406358

a)Height of adult females

Method of Moments (MM)

Normal Distribution

The first setp is to estimates the parameter. We can use mean function to find the mean of Glycohemoglobin, and sd function for the standard diviation of Glycohemoglobin.

ht<-d1$ht
#Estimates of parameters
mm.norm.mean=mean(ht)
mm.norm.sd=sd(ht)
mm.norm.mean
## [1] 160.7419
mm.norm.sd
## [1] 7.320161

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dnorm() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf match the histogram. From the graph, Estimated pdf matches the pdf well.

# Overlay estimated pdf onto histogram
hist(ht,main="Overlay Height of adult females estimated pdf onto histogram",xlab="GH",breaks=100,freq=F)
curve(dnorm(x, mm.norm.mean,mm.norm.sd),add=T,lwd=1)

# Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pnorm() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF jut over the eCDF. the estimated CDF match with eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(ht),main  = "Overlay Height of adult females estimated CDF onto eCDF ",ylab="CDF",xlab="GH",col="orange")
#estimated CDF
curve(pnorm(x, mm.norm.mean,mm.norm.sd),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qnorm() for the theoretical quantile.We can see here that the plotted points match the line y=x. The sample distribution match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(ht,ppoints(1000))
theore=qnorm(ppoints(1000),mm.norm.mean,mm.norm.sd)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qnorm() at 0.5.

# Estimated Median
qnorm(.5,mm.norm.mean,mm.norm.sd)
## [1] 160.7419

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can daw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rnorm(sample.size*sample.times,mm.norm.mean,mm.norm.sd ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 160.1706 161.3047

Gamma Distribution

The first setp is to estimates the parameter. We can find the shape of Glycohemoglobin, and scale of Glycohemoglobin.

ht<-d1$ht
#Estimates of parameters
ht<-d1$ht
mm.gamma.shape=mean(ht)^2/var(ht)
mm.gamma.scale=var(ht)/mean(ht)
#Shape
mm.gamma.shape
## [1] 482.1886
#Scale
mm.gamma.scale
## [1] 0.333359

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dgamma() expression. The estimated pdf has a similar tendency as the histogram has, and the highest point of the estimated pdf match the histogram. From the graph, Estimated pdf also match the pdf well.

# Overlay estimated pdf onto histogram
hist(ht,main="Overlay Glycohemoglobin estimated pdf onto histogram",xlab="GH",breaks=100,freq=F)
curve(dgamma(x,shape=mm.gamma.shape,scale=mm.gamma.scale),add=TRUE,lwd=1)

# Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pgamma() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF is over the eCDF. the estimated CDF match eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(ht),main  = "Overlay Glycohemoglobin estimated CDF onto eCDF ",ylab="CDF",xlab="GH",col="orange")
#estimated CDF
curve(pgamma(x,shape=mm.gamma.shape,scale=mm.gamma.scale),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qgamma() for the theoretical quantile.We can see here that the plotted points match the line y=x. The sample distribution match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(ht,ppoints(1000))
theore=qgamma(ppoints(1000),shape=mm.gamma.shape,scale=mm.gamma.scale)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qgamma() at 0.5

# Estimated Median
qgamma(0.5,shape=mm.gamma.shape,scale=mm.gamma.scale)
## [1] 160.6308

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can draw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rgamma(sample.size*sample.times,shape=mm.gamma.shape,scale=mm.gamma.scale ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 160.0568 161.1997

Weibull Distribution

lambda and k are two parameters here. To find these parameters, we need have some functions first. First, we have the lambda function here. Based on the function \(V[x]=(E[X]/\Gamma(1 + 1/k)^2 [\Gamma(1 + 2/k) - (\Gamma(1 + 1/k))^2]\), we can also have the variance function. To find the k, we need opytimize the function to find the minimum.

ht<-d1$ht
# Estimates of parameters
lambda=function(samp_mean,k){
  samp_mean/gamma(1+1/k)
}
var_weib=function(k,samp_mean,samp_var){
  lambda(samp_mean,k)^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp_var
}
#optimization
mm.opt <- optimize(f = function(x) {
    abs(var_weib(k = x, samp_mean = mean(ht),samp_var = var(ht)))
    },  lower = 5, upper = 10000)

#MM weibull k estimate
weib.k <- mm.opt$minimum
weib.k
## [1] 27.45939
#MM weibull lambda estimate
weib.lambda <- lambda(samp_mean = mean(ht), k = weib.k)
weib.lambda
## [1] 163.9807

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dweibull expression. The shape here equals weibull k, and the scale here is weibull.lambda. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf match the histogram. From the graph, Estimated pdf also generally match the pdf well.

# Overlay estimated pdf onto histogram
hist(ht,main="Overlay Height of adult femalesestimated pdf onto histogram",xlab="HT",breaks=100,freq=F)
curve(dweibull(x,shape=weib.k,scale=weib.lambda),add=TRUE,lwd=1)

# Overlay estimated CDF onto eCDF Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pweibull expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph,the estimated CDF has a similar tendency with eCDF, the red estimated CDF is just over the eCDF. the estimated CDF matches eCDF well.

# Overlay estimated CDF onto eCDF
#eCDF
plot(ecdf(ht),main  = "Overlay Glycohemoglobin estimated CDF onto eCDF ",ylab="CDF",xlab="HT",col="orange")
#estimated CDF
curve(pweibull(x,shape=weib.k,scale=weib.lambda),add=TRUE,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

# QQ plot (sample vs estimated distribution) Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qweibull() for the theoretical quantile.We can see here that the plotted points generally matches the line y=x. The sample distribution generally match with the theoretical distribution.

# QQ plot (sample vs estimated dist)
sample=quantile(ht,ppoints(1000))
theore=qweibull(ppoints(1000),shape=weib.k,scale=weib.lambda)
plot(theore,sample,main="QQ plot (sample vs estimated dist)(MM)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

# Estimated Median For the estimated median, we look for qweibull() at 0.5

# Estimated Median
qweibull(0.5,shape=weib.k,scale=weib.lambda)
## [1] 161.8065

Median Samp Distribution (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can draw a histogram for the median sample distribution.

# simulation Sample
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rweibull(sample.size*sample.times,shape=weib.k,scale=weib.lambda), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)
# Median Sample Dist (hist)
hist(sample,xlab = "Median Sample",main="Median Samp Distribution (hist)(MM)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 161.2732 162.3242

Maximum likelihood (MLE)

Normal Distribution

For performing MLE, a function has created for using the normal distribution and its parameters.

require(stats4)
nLL <- function(mean, sd){
  fs <- dnorm(
        x = ht
      , mean = mean
      , sd = sd
      , log = TRUE
    ) 
  -sum(fs)
}
ht.mle <- mle(
    nLL
  , start = list(mean = 5, sd = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(ht.mle), absVal = FALSE)

# Estimates of parameters We can use coef() function to find mean and sd here.

#Estimates of parameters
coef(ht.mle)
##       mean         sd 
## 160.741911   7.316503

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dnorm() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf matches the histogram. From the graph, Estimated pdf also matches the pdf well.Result is same with MM.

# Overlay estimated pdf onto histogram
hist(ht,main = "Overlay estimated pdf onto histogram (MLE)",col="white",xlab="HT",freq = FALSE,breaks=50)
curve(dnorm(x, coef(ht.mle)[1], coef(ht.mle)[2]), add = T)

Overlay estimated CDF onto eCDF

Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pnorm() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph, the estimated CDF has a similar tendency with eCDF, the red estimated CDF is over the eCDF. The estimated CDF matches eCDF well. Result is same with MM.

# Overlay estimated CDF onto eCDF
plot(ecdf(ht), main = "Overlay estimated CDF onto eCDF (MLE)",xlab="HT",ylab="CDF",col="orange")
curve(pnorm(x, coef(ht.mle)[1], coef(ht.mle)[2]), add = T,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

QQ plot (sample vs estimated dist)

Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qnorm() for the theoretical quantile.We can see here that the plotted points matches the line y=x. The sample distribution matches with the theoretical distribution.

# QQ plot (sample vs estimated dist)(mle)
sample=quantile(ht,ppoints(1000))
theore=qnorm(ppoints(1000),coef(gh.mle)[1],coef(gh.mle)[2])
plot(theore,sample,pch=10,main="QQ plot (sample vs estimated dist)(MLE)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

Estimated Median

For the estimated median, we look for qnorm() at 0.5.

# Estimated Median
qnorm(.5,mean = coef(ht.mle)[1],sd=coef(ht.mle)[2])
## [1] 160.7419

Median Samp Dist (hist)

To generate a sample histogram, I use simulation method for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can daw a histogram for the median sample distribution.

# Median Samp Dist (hist)
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rnorm(sample.size*sample.times,mean =coef(ht.mle)[1],sd=coef(ht.mle)[2] ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)

hist(sample,xlab = "Median Sample",main="Median Samp Dist (hist)(MLE)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 160.1787 161.2979

Gamma Distribution

For performing MLE, a function has created for using the normal distribution and its parameters.

require(stats4)
nLL <- function(shape,scale){
  fs <- dgamma(
        x = ht
      , shape = shape
      , scale = scale
      , log = TRUE
    ) 
  -sum(fs)
}
ht.mle <- mle(
    nLL
  , start = list(shape = 1, scale = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(ht.mle), absVal = FALSE)

# Estimates of parameters We can use coef() function to find shpe and scale here.

#Estimates of parameters
coef(ht.mle)
##       shape       scale 
## 479.6126962   0.3351514

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dgamma() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf matches the histogram. From the graph, Estimated pdf also matches the pdf well.Result is same with MM.

# Overlay estimated pdf onto histogram
hist(ht,main = "Overlay estimated pdf onto histogram (MLE)",col="white",xlab="HT",freq = FALSE,breaks=100)
curve(dgamma(x, shape=coef(ht.mle)[1], scale=coef(ht.mle)[2]), add = T)

Overlay estimated CDF onto eCDF

Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pgamma() expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph, the estimated CDF has a similar tendency with eCDF, the red estimated CDF is overthe eCDF. the estimated CDF matches eCDF well. Result is same with MM.

# Overlay estimated CDF onto eCDF
plot(ecdf(ht), main = "Overlay estimated CDF onto eCDF (MLE)",xlab="GH",ylab="CDF",col="orange")
curve(pgamma(x, shape=coef(ht.mle)[1], scale=coef(ht.mle)[2]), add = T,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

QQ plot (sample vs estimated dist)

Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qgamma() for the theoretical quantile.We can see here that the plotted points matches the line y=x. The sample distribution matches with the theoretical distribution.

# QQ plot (sample vs estimated dist)(mle)
sample=quantile(ht,ppoints(1000))
theore=qgamma(ppoints(1000),shape=coef(ht.mle)[1],scale=coef(ht.mle)[2])
plot(theore,sample,pch=10,main="QQ plot (sample vs estimated dist)(MLE)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

Estimated Median

For the estimated median, we look for qgamma() at 0.5.

# Estimated Median
qgamma(.5,shape = coef(ht.mle)[1],scale=coef(ht.mle)[2])
## [1] 160.6311

Median Samp Dist (hist)

To generate a sample histogram, I use simulation method and rgamma() function for an outcome, which the sample size is 1000 and simulation times are 10000. hist() expression can draw a histogram for the median sample distribution.

# Median Samp Dist (hist)
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rgamma(sample.size*sample.times,shape =coef(ht.mle)[1],scale=coef(ht.mle)[2] ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)

hist(sample,xlab = "Median Sample",main="Median Samp Dist (hist)(MLE)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 160.0745 161.1996

Weibull Distribution

For performing MLE, a function has created for using the normal distribution and its parameters.

require(stats4)
nLL <- function(shape, scale){
  fs <- dweibull(
        x = ht
      , shape = shape
      , scale = scale
      , log = TRUE
    ) 
  -sum(fs)
}
ht.mle <- mle(
    nLL
  , start = list(shape = 1, scale = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(ht.mle), absVal = FALSE)

# Estimates of parameters We can use coef() function to find mean and sd here.

#Estimates of parameters
coef(ht.mle)
##     shape     scale 
##  21.85398 164.24719

Overlay estimated pdf onto histogram

Now we will look for overlay estimated pdf onto histogram. hist() function can be used for histogram. For the estimated PDF, we use curve function based on dweibull() expression. While the estimated pdf has a similar tendency as the histogram has, the highest point of the estimated pdf matches the histogram. From the graph, Estimated pdf also matches the pdf well.Result is same with MM.

# Overlay estimated pdf onto histogram
hist(ht,main = "Overlay estimated pdf onto histogram (MLE)",col="white",xlab="HT",freq = FALSE,breaks=50)
curve(dweibull(x, shape=coef(ht.mle)[1], scale=coef(ht.mle)[2]), add = T)

Overlay estimated CDF onto eCDF

Now we are looking for the graph to overlay estimated CDF onto eCDF. For the estimated CDF, we use plot function based on pweibull expression, marked as red color. For eCDF, we use ecdf to reflect gh data in to the graph. From the graph, the estimated CDF has a similar tendency with eCDF, the red estimated CDF is over the eCDF. the estimated CDF matchES eCDF well. Result is same with MM.

# Overlay estimated CDF onto eCDF
plot(ecdf(ht), main = "Overlay estimated CDF onto eCDF (MLE)",xlab="GH",ylab="CDF",col="orange")
curve(pweibull(x, coef(ht.mle)[1], coef(ht.mle)[2]), add = T,col="red")
legend("bottomright",c("eCDF","estimated CDF"), col = c("orange","red"), lwd=1)

QQ plot (sample vs estimated dist)

Now we are going to create a QQ plot.I use quantile()function for the sample quantile and qweibull() for the theoretical quantile.We can see here that the plotted points generally matches the line y=x. The sample distribution generally matches with the theoretical distribution.

# QQ plot (sample vs estimated dist)(mle)
sample=quantile(ht,ppoints(1000))
theore=qweibull(ppoints(1000),coef(ht.mle)[1],coef(ht.mle)[2])
plot(theore,sample,pch=10,main="QQ plot (sample vs estimated dist)(MLE)",ylab="sample quantile",xlab="theoretical quantile")
abline(0,1)

Estimated Median

For the estimated median, we look for qweibull() at 0.5.

# Estimated Median
qweibull(.5,shape = coef(ht.mle)[1],scale=coef(ht.mle)[2])
## [1] 161.5156

Median Samp Dist (hist)

To generate a sample histogram, I use simulation method and rweibull() function for an outcome, which the sample size is 1000 and simulation times are 10000.hist() expression can daw a histogram for the median sample distribution.

# Median Samp Dist (hist)
sample.size <- 1000
sample.times<- 10000
simulate.sample <- array(rweibull(sample.size*sample.times,shape =coef(ht.mle)[1],scale=coef(ht.mle)[2] ), c(sample.times,sample.size))
sample <- apply(simulate.sample,1,median)

hist(sample,xlab = "Median Sample",main="Median Samp Dist (hist)(MLE)",breaks = 100)

Range of middle 95% of Samp Dist

The range of middle 95% of sample distribution will covered from 2.5% quantile to 97.5%.

# Range of middle 95% of Sample Dist
quantile(sample, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 160.8569 162.1708